rm(list = ls())

library(tidyverse)
library(haven) # for read_sas, read_dta
library(lfe)
library(fuzzyjoin)
library(sjmisc) #for row_sums
library(lubridate)
library(texreg)
select = dplyr::select

source("functions.R")

# Load military data
persons_raw = read_dta("data_prep/persons.dta")

# Rename variables and scale
persons = persons_raw %>% 
  mutate(age_pkoe1 = year(pkoe1_suorituspvm) - synvuosi,
         age_pkoe2 = year(pkoe2_suorituspvm) - synvuosi,
         lie = T1_1,
         validity = T1_2,
         fix = T1_3,
         psyko = T1_4,
         hypokondria = T1_5,
         psykastenia = T1_6,
         schizo = T1_7,
         leader = T2_1,
         energy = T2_2,
         achieve = T2_3,
         conf = T2_4,
         delib = T2_5,
         social = T2_6,
         duty = T2_7,
         masc = T2_8,
         visuos = kuv,
         words = san,
         math = las,
         gpa = ka,
         gpa_t = lka) %>% 
  group_by(synvuosi) %>% 
  mutate_at(vars(lie:gpa_t), myScale) %>% 
  ungroup

# Create new variables and select
persons = persons %>% 
  row_means(energy, social, n = 2, var = "extro") %>% 
  row_means(energy, social, conf, leader, achieve, n = 5, var = "extro_factor") %>%
  row_means(energy, social, conf, leader, n = 4, var = "extro_deming") %>%
  row_means(energy, social, masc, n = 3, var = "jock_mean") %>% 
  row_means(duty, delib, achieve, n = 3, var = "consci") %>% 
  row_means(duty, delib, n = 2, var = "consci_factor") %>% 
  row_means(duty, delib, achieve, leader, conf, n = 5, var = "nerd_mean") %>% 
  row_means(leader, conf, n = 2, var = "leader_conf") %>% 
  row_means(energy, social, duty, delib, achieve, leader, conf, n = 7, var = "noncog") %>% 
  row_means(visuos, words, math, n = 3, var = "cog") %>% 
  select(shnro, synvuosi, age_pkoe1, age_pkoe2, aidin_tutnro, johted, lie:cog)

# Create terciles for traits
persons = persons %>% group_by(synvuosi) %>% 
  mutate(tercileExtro = ntile(extro, 3),
         tercileCog = ntile(cog, 3),
         tercileNoncog = ntile(noncog, 3),
         tercileConsci = ntile(consci, 3)) %>% 
  mutate(cog_rank = percent_rank(cog),
         noncog_rank = percent_rank(noncog),
         extro_rank = percent_rank(extro),
         consci_rank = percent_rank(consci)) %>% ungroup
  

# Create brother sample ---------------------------------------------------

brothers = persons %>% 
  filter(aidin_tutnro != "") %>% 
  arrange(aidin_tutnro, synvuosi) %>% 
  group_by(aidin_tutnro) %>% 
  mutate(shnro_bro = case_when( # choose brother closest in age or in case of ties, the older one
    lead(synvuosi) - synvuosi < synvuosi - lag(synvuosi) | is.na(lag(synvuosi)) ~ lead(shnro),
    lead(synvuosi) - synvuosi >= synvuosi - lag(synvuosi) | is.na(lead(synvuosi)) ~ lag(shnro))) %>% 
  ungroup

brothers = brothers %>% 
  filter(!is.na(shnro_bro)) %>% 
  select(shnro, shnro_bro)

brothers = brothers %>% 
  left_join(rename_with(persons, ~paste0(., "_bro")), by = c("shnro_bro" = "shnro_bro"))
  
persons = persons %>% 
  left_join(brothers, by = c("shnro" = "shnro"))


# Clean matriculation exam data  ---------------------------------------------------------

# Read cleaned exam data
load("./data_input/ytl_1989_2013_no_duplicates.Rdata")

# Add school id
ytl = ytl %>% arrange(ytl_vuosi_kausi) %>% group_by(shnro) %>% mutate(koulu = ytl_sotunnus[1]) %>% ungroup

# Make wide
ytl_wide = ytl %>% 
  mutate(ytl_arvosana_group = replace(ytl_arvosana_group, ytl_arvosana_group == "E", "L")) %>% 
  # Important to select only variables that get a single value per individual
  select(shnro, ytl_aine, ytl_arvosana_group, ytl_z, ytl_mode_vuosi, koulu) %>% 
  # Make wide
  rename(grade = ytl_arvosana_group) %>% 
  rename(standard = ytl_z) %>% 
  gather(variable, value, grade, standard) %>% # First make separate row for every variable with multiple values per shrno
  unite(temp, ytl_aine, variable) %>% # Then combine the name of the field with the variable name
  spread(temp, value) %>%  # The spread by the this combined name
  select(shnro, koulu, ytl_mode_vuosi, matches("^M_|^A_|^N_|^R_|^EA_|^S_|^BB_"))

# Add dummy variables and impute -1 for NA points
ytl_wide = ytl_wide %>% 
  mutate(Has_EA = as.numeric(!is.na(EA_grade)),
         Has_A = as.numeric(!is.na(A_grade)), 
         Has_M = as.numeric(!is.na(M_grade)), 
         Has_N = as.numeric(!is.na(N_grade)), 
         Has_R = as.numeric(!is.na(R_grade))) %>% 
  mutate_at(vars(matches("_grade")), fct_explicit_na) %>% 
  mutate_at(vars(matches("_standard")), as.numeric)

# Add mother tongue year and exam season
kausi = ytl %>% filter(ytl_aine == "A") %>% 
  mutate(A_vuosi = ytl_vuosi, 
         A_kausi = ytl_kausi) %>% 
  select(shnro, A_vuosi, A_kausi)

ytl_wide = ytl_wide %>% left_join(kausi)
#save(ytl_wide, file = 'data_output/ytl_wide.Rdata')

persons = persons %>% 
  left_join(ytl_wide, by = c("shnro" = "shnro")) 



# Add Math anchor ---------------------------------------------------------

# Prepare data for dimension reduction (anchoring and personality bundling)
dta_math = persons %>% 
  filter(Has_A == 1,
         Has_M + Has_N != 2) %>% 
  mutate(Has_EA = as.numeric(!is.na(EA_standard)), # Redefine Has_ as lacking score instead of grade
         Has_M = as.numeric(!is.na(M_standard)), 
         Has_N = as.numeric(!is.na(N_standard)), 
         Has_R = as.numeric(!is.na(R_standard))) %>% 
  tidyr::replace_na(list(M_standard = 0, R_standard = 0, N_standard = 0, EA_standard = 0)) %>% 
  mutate(M = Has_M*M_standard, 
         N = Has_N*N_standard)

# Important that these match (also Has_N)
dta_math %>% count(Has_M, M_standard == 0)
# Only valid if either M or N is taken
dta_math %>% count(Has_M, Has_N)

# Single anchor for all
anchor = felm(math ~ Has_M + M_standard:Has_M + Has_N + N_standard:Has_N | ytl_mode_vuosi, data = dta_math)
anchor$coefficients

texreg(anchor,
       file = "tables/math_anchor.tex",
       reorder.coef = c(2,1,4,3),
       custom.header = list("Dependent variable: Military Math" = 1),
       custom.model.names = c("(1)"),
       custom.coef.names = c("\\delta_2", "\\delta_1", "\\delta_4", "\\delta_3"),
       caption = "Math Anchoring Regressions",
       label = "tab:math_anchor",
       #custom.note = "",
       caption.above = T,
       stars = numeric(0),
       digits = 3,
       float.pos = "h!",
       threeparttable = T,
       dcolumn = T,
       booktabs = T,
       include.rsquared = F,
       include.rmse = F,
       include.ci = F)

# Anchor is the fitted value (only if no missing values in dta). Standardise.
dta_math = dta_math %>%
  mutate(math_anchored = anchor$fitted.values) %>% 
  group_by(synvuosi) %>% 
  mutate(math_anchored = myScale(math_anchored)) %>% 
  ungroup %>% 
  select(shnro, math_anchored)

persons = persons %>% left_join(dta_math)



# Add new variables -------------------------------------------------------

# Add high school gpa and high school indicator
persons = persons %>% 
  row_means(A_standard, M_standard, N_standard, R_standard, S_standard, EA_standard, BB_standard,
            n = 3,
            var = "score") %>% 
  mutate(gym = shnro %in% ytl_wide$shnro)

# Standardise new variables
persons = persons %>% 
  group_by(synvuosi) %>% 
  mutate_at(vars(extro:cog), myScale) %>% 
  mutate_at((vars(matches("_standard"), math_anchored, score)), myScale) %>% 
  ungroup 


# Save
save(persons, file = 'data_output/ytl_persons_brothers.Rdata')
